---
title: "Covid-19 vaccine data discourse analysis"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(ggplot2)
library(ggpubr)
library(texreg)
library(dplyr)
```

```{r}
#Helper functions 

#replace NAs with median
ReplaceNAs <- function(d) {
  d$account.follower.count[is.na(d$account.follower.count)] <-  median(d$account.follower.count, na.rm=TRUE)
  
  d$account.tweet.count[is.na(d$account.tweet.count)] <- median(d$account.tweet.count, na.rm=TRUE)
  
  return(d)
}

#visualize prototypical plots
VisProto <- function(proto_data, model, y_lab) {
  tgc = data.frame(
    predict(model, newdata=proto_data, type="response",se.fit=TRUE)
    )
  tgc$type = c('Anti-vaccine','Pro-vaccine')

  p <-ggplot(
    data=tgc, 
    aes(x=type, y=fit, group=1)) +
  geom_point(shape=1,
             size=10, 
             stroke = 2) +
  geom_errorbar(aes(ymin=fit-1.96*se.fit, ymax=fit+1.96*se.fit),
                  width=.15,
                  size = 1,# Width of the error bars
                  position=position_dodge(.9)) + 
  #ylim(-0.001, 0.02) + 
  xlab("Tweet type") + 
  ylab(y_lab) +
  theme(axis.title.x=element_text(size=32),
        axis.title.y=element_text(size=32),
        axis.text.x=element_text(size=28),
        axis.text.y=element_text(size=28))
  
  return(p)
}

```

```{r}
#load data
d.auth <- read.csv('authority_analysis.csv')
d.assert <- read.csv('certainty_analysis.csv')
d.causal <- read.csv('causal_claims_analysis.csv')
```
```{r}
# processing data

#add a column describing if any authority figures are meantioned to the authority dataset
d.auth$mentioned.authority <- as.integer(as.logical(d.auth$mentioned.researcher + d.auth$mentioned.politician + d.auth$mentioned.physician))

#replace the NAs in account follower and tweet count with median
figures <-ReplaceNAs(d.auth)

certainty <-ReplaceNAs(d.assert)

causal.claims <-ReplaceNAs(d.causal)

```

```{r}
#Hypothesis 1: Mention of authority figures
#The following results are those of the models M1, M1.1, M1.2, M1.3 in Table 3

#Fitting models
#M1
m1 <- glm(mentioned.authority ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = figures, family = binomial)

#M1.1
m1.1 <- glm(mentioned.researcher ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = figures, family = binomial)

#M1.2
m1.2 <- glm(mentioned.politician ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = figures, family = binomial)

#M1.3
m1.3 <- glm(mentioned.physician ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = figures, family = binomial)

#show model results
summary(m1)
summary(m1.1)
summary(m1.2)
summary(m1.3)


#Visualizing prototypical results

#build prototypical data
proto.data.m1 <- data.frame(
  type = c('anti','pro'),
  account.tweet.count = median(figures$account.tweet.count),
  account.follower.count = median(figures$account.follower.count),
  tweet.length = median(figures$tweet.length)
)

#build plots
p.m1 <- VisProto(proto.data.m1, m1, "Probability of mentioning \n authority figure in the Tweet")

p.m1.1 <- VisProto(proto.data.m1, m1.1, "Probability of mentioning \n researcher in the Tweet")

p.m1.2 <- VisProto(proto.data.m1, m1.2, "Probability of mentioning \n politician in the Tweet")

p.m1.3 <- VisProto(proto.data.m1, m1.3, "Probability of mentioning \n physician in the Tweet")

#show plots
p.m1
p.m1.1
p.m1.2
p.m1.3
```

```{r}
#Hypothesis 2: Certainty
#The following results are those of the models M2 in Table 4

#Fitting models
#M2
m2 <- glm(is.certain ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = certainty, family = binomial)

#show model results
summary(m2)


#Visualizing prototypical results

#build prototypical data
proto.data.m2 <- data.frame(
  type = c('anti','pro'),
  account.tweet.count = median(certainty$account.tweet.count),
  account.follower.count = median(certainty$account.follower.count),
  tweet.length = median(certainty$tweet.length)
)

#build plots
p.m2 <- VisProto(proto.data.m2, m2, "Probability of expressing certainty in the Tweet")

#show plots
p.m2
```

```{r}
#Hypothesis 3: Causal claims
#The following results are those of the models M3 in Table 5

#Fitting models
#M3
m3 <- glm(is.causal ~ type + log1p(account.tweet.count) + log1p(account.follower.count) + log1p(tweet.length), data = causal.claims, family = binomial)

#show model results
summary(m3)


#Visualizing prototypical results

#build prototypical data
proto.data.m3 <- data.frame(
  type = c('anti','pro'),
  account.tweet.count = median(causal.claims$account.tweet.count),
  account.follower.count = median(causal.claims$account.follower.count),
  tweet.length = median(causal.claims$tweet.length)
)

#build plots
p.m3 <- VisProto(proto.data.m3, m3, "Probability of including causal claims in the Tweet")

#show plots
p.m3
```
```{r}
texreg(list(m1, m1.1, m1.2, m1.3, m2, m3), 
       custom.model.names = c("Mentioned\\\\authority?\\\\(M1)", "Mentioned\\\\researcher?\\\\(M1.1)", "Mentioned\\\\politician?\\\\(M1.2)", "Mentioned\\\\physician?\\\\(M1.3)", "Is.certain?\\\\(M2)", "Is.causal?\\\\(M3)"),
       custom.coef.names = c("(Intercept)","Is.pro-vaccine?", "log1p(Account.tweet.count)", "log1p(Account.follower.count)", "log1p(Tweet.length)"),
       digits = 2, 
       table = FALSE)

```

```{r}
tgc1 = data.frame(
    predict(m1, newdata=proto.data.m1, type="response",se.fit=TRUE)
    )
tgc1$type = c('Anti-vaccine','Pro-vaccine')
  
tgc1.1 = data.frame(
    predict(m1.1, newdata=proto.data.m1, type="response",se.fit=TRUE)
    )
tgc1.1$type = c('Anti-vaccine','Pro-vaccine')

tgc1.2 = data.frame(
    predict(m1.2, newdata=proto.data.m1, type="response",se.fit=TRUE)
    )
tgc1.2$type = c('Anti-vaccine','Pro-vaccine')

tgc1.3 = data.frame(
    predict(m1.3, newdata=proto.data.m1, type="response",se.fit=TRUE)
    )
tgc1.3$type = c('Anti-vaccine','Pro-vaccine')

tgc2 = data.frame(
    predict(m2, newdata=proto.data.m2, type="response",se.fit=TRUE)
    )
tgc2$type = c('Anti-vaccine','Pro-vaccine')

tgc3 = data.frame(
    predict(m3, newdata=proto.data.m3, type="response",se.fit=TRUE)
    )
tgc3$type = c('Anti-vaccine','Pro-vaccine')

combined_df <- rbind(tgc1, tgc1.1, tgc1.2, tgc1.3, tgc2, tgc3)
```


```{r}
#regression visualization
library(ggplot2)
library(dplyr)

CreateProtoData <- function(d) {
  df <- data.frame(
    type = c('anti','pro'),
    account.tweet.count = median(d$account.tweet.count),
    account.follower.count = median(d$account.follower.count),
    tweet.length = median(d$tweet.length)
  )
  return(df)
}

FitModel <- function(protodata, model){
  tgc = data.frame(
    predict(model, newdata=protodata, type="response", se.fit=TRUE)
  )
  tgc$type = c('Anti-vaccine', 'Pro-vaccine')
  return(tgc)
}

# Generate prototype data
proto.data <- CreateProtoData(certainty)

# Fit models
tgc1 <- FitModel(proto.data, m1)
tgc1.1 <- FitModel(proto.data, m1.1)
tgc1.2 <- FitModel(proto.data, m1.2)
tgc1.3 <- FitModel(proto.data, m1.3)
tgc2 <- FitModel(proto.data, m2)
tgc3 <- FitModel(proto.data, m3)

# Combine data into a single dataframe
combined_df <- bind_rows(tgc1 %>% mutate(x = 1),
                         tgc1.1 %>% mutate(x = 2),
                         tgc1.2 %>% mutate(x = 3),
                         tgc1.3 %>% mutate(x = 4), 
                         tgc2 %>% mutate(x = 5),
                         tgc3 %>% mutate(x = 6))

# Create updated plot
ggplot(combined_df, aes(x = as.factor(x), y = fit, shape = type, color = type)) +
  geom_point(size = 2, stroke = 1, position = position_dodge(width = 0.5)) +
  geom_errorbar(aes(ymin = fit - 1.96 * se.fit, ymax = fit + 1.96 * se.fit), 
                width = 0.2, size = 0.5, position = position_dodge(width = 0.5)) +
  scale_x_discrete(labels = c(
    "(M1) \n Mentions \n any authorities", 
    "(M1.1) \n Mentions \n researchers", 
    "(M1.2) \n Mentions \n politicians", 
    "(M1.3) \n Mentions \n physicians", 
    "(M2) \n Expresses \n certainty", 
    "(M3) \n Contains \n causal claims"
  )) + 
  scale_y_continuous(breaks = seq(0, 0.5, 0.1)) +  
  scale_color_manual(values = c("Anti-vaccine" = "#f77454", "Pro-vaccine" = "#27c064")) +  
  xlab("Textual features") + 
  ylab("Predicted probability of textual feature in a tweet \n") + 
  theme_minimal() +  
  theme(
    axis.text.x = element_text(angle = 45, size = 9, hjust = 1),
    axis.text.y = element_text(size = 10),
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10),
    legend.position = c(0.15, 0.85),  
    legend.justification = c(0, 1),
    legend.background = element_rect(fill = "white", color = "black"),
    legend.text = element_text(size = 9)
  )
```

```{r}
library(ggplot2)
library(dplyr)

# Get actual sample sizes
n_anti_cert <- sum(certainty$type == 'anti')
n_pro_cert <- sum(certainty$type == 'pro')

n_anti_causal <- sum(causal.claims$type == 'anti')
n_pro_causal <- sum(causal.claims$type == 'pro')

n_anti_auth <- sum(figures$type == 'anti')
n_pro_auth <- sum(figures$type == 'pro')

n_anti_res <- sum(figures$type == 'anti')
n_pro_res <- sum(figures$type == 'pro')

n_anti_pol <- sum(figures$type == 'anti')
n_pro_pol <- sum(figures$type == 'pro')

n_anti_phy <- sum(figures$type == 'anti')
n_pro_phy <- sum(figures$type == 'pro')

# Function to compute 95% CI for proportions
compute_ci <- function(mean, n) {
  se <- sqrt((mean * (1 - mean)) / n)  # Standard error for proportion
  lower <- mean - 1.96 * se
  upper <- mean + 1.96 * se
  return(c(lower, upper))
}


d.cert <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(certainty[certainty$type == 'anti', ]$is.certain), mean(certainty[certainty$type == 'pro', ]$is.certain))
) %>%
  mutate(n = c(n_anti_cert, n_pro_cert),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

d.causal <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(causal.claims[causal.claims$type == 'anti', ]$is.causal), mean(causal.claims[causal.claims$type == 'pro', ]$is.causal))
) %>%
  mutate(n = c(n_anti_causal, n_pro_causal),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

d.auth <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(figures[figures$type == 'anti', ]$figure), mean(figures[figures$type == 'pro', ]$figure))
) %>%
  mutate(n = c(n_anti_auth, n_pro_auth),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

d.res <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(figures[figures$type == 'anti', ]$mentioned.researcher), mean(figures[figures$type == 'pro', ]$mentioned.researcher))
) %>%
  mutate(n = c(n_anti_res, n_pro_res),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

d.pol <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(figures[figures$type == 'anti', ]$mentioned.politician), mean(figures[figures$type == 'pro', ]$mentioned.politician))
) %>%
  mutate(n = c(n_anti_pol, n_pro_pol),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

d.phy <- data.frame(
  Type = c('Anti-vaccine', 'Pro-vaccine'),
  Mean = c(mean(figures[figures$type == 'anti', ]$mentioned.physician), mean(figures[figures$type == 'pro', ]$mentioned.physician))
) %>%
  mutate(n = c(n_anti_phy, n_pro_phy),
         CI = mapply(compute_ci, Mean, n),
         Lower = CI[1,], Upper = CI[2,]) %>%
  select(-CI)

# Combine datasets with a categorical x-axis
combined_d <- bind_rows(d.auth %>% mutate(x = 1),
                         d.res %>% mutate(x = 2),
                         d.pol %>% mutate(x = 3),
                         d.phy %>% mutate(x = 4), 
                         d.cert %>% mutate(x = 5),
                         d.causal %>% mutate(x = 6))

# Create plot with error bars for 95% CI
ggplot(combined_d, aes(x = as.factor(x), y = Mean, fill = Type)) +
  geom_bar(position = position_dodge(width = 0.9), stat = "identity") +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), 
                position = position_dodge(width = 0.9), width = 0.2, color = "black") +  # 95% CI
  scale_x_discrete(labels = c("Mentions \n any authorities", "Mentions \n researchers", "Mentions \n politicians", "Mentions \n physicians", "Expresses \n certainty", "Contains \n causal claims")) + 
  scale_y_continuous(breaks = seq(0, 0.5, 0.1)) +  
  xlab("Textual features") + 
  ylab("Proportion of tweets\n") + 
  theme_minimal() +  # Cleaner theme
  theme(axis.text.x = element_text(angle = 45, size = 9, hjust = 1),
        legend.position = c(0.15, 0.85),  
        legend.justification = c(0, 1),
        legend.background = element_rect(fill = "white", color = "black"),
        legend.text = element_text(size = 9),
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10),
        axis.text.y = element_text(size = 9)) +
  scale_fill_manual(values = c("Anti-vaccine" = "#f77454", "Pro-vaccine" = "#27c064")) 
```

